{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NEW_PATH=\"data/cleaned/selective_lexicons/\"\n",
    "OLD_PATH=\"data/cleaned/custom_lexicons/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from glob import glob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Geolocations\n",
    "\n",
    "```\n",
    "## Doesn't terminate. Takes too long\n",
    "\n",
    "cat ../custom_lexicons/all_geonames.txt ../custom_lexicons/cities.txt.results.txt ../custom_lexicons/location* ../custom_lexicons/cap.1000 | sort -f | uniq -i > geo-loc.txt\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['data/cleaned/custom_lexicons/location.country', 'data/cleaned/custom_lexicons/location', 'data/cleaned/custom_lexicons/cities.txt.results.txt', 'data/cleaned/custom_lexicons/cap.1000', 'data/cleaned/custom_lexicons/all_geonames.txt']\n"
     ]
    }
   ],
   "source": [
    "filenames = glob(\"data/cleaned/custom_lexicons/location*\") + [\n",
    "    \"data/cleaned/custom_lexicons/cities.txt.results.txt\",\n",
    "    \"data/cleaned/custom_lexicons/cap.1000\",\n",
    "    \"data/cleaned/custom_lexicons/all_geonames.txt\"\n",
    "]\n",
    "print filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data/cleaned/custom_lexicons/location.country Processed: 436\n",
      "data/cleaned/custom_lexicons/location Processed: 620620\n",
      "data/cleaned/custom_lexicons/cities.txt.results.txt Processed: 17930\n",
      "data/cleaned/custom_lexicons/cap.1000 Processed: 971\n",
      "data/cleaned/custom_lexicons/all_geonames.txt Processed: 8333112\n"
     ]
    }
   ],
   "source": [
    "big_set = set()\n",
    "for fn in filenames:\n",
    "    with open(fn) as fp:\n",
    "        print fn, \n",
    "        i = 0\n",
    "        for line in fp:\n",
    "            line = line.strip().title()\n",
    "            big_set.add(line)\n",
    "            i += 1\n",
    "        print \"Processed: %s\" % i\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import codecs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8659218\n"
     ]
    }
   ],
   "source": [
    "with open(\"data/cleaned/selective_lexicons/geo-loc.txt\", \"wb+\") as fp:\n",
    "    print len(big_set)\n",
    "    for k in big_set:\n",
    "        print >> fp, k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-rw-r-- 1 entity entity 4.6K Sep 16 18:53 data/cleaned/custom_lexicons/cvg.cvg_platform\r\n",
      "-rw-rw-r-- 1 entity entity  63K Sep 16 18:53 data/cleaned/custom_lexicons/cvg.cvg_developer\r\n",
      "-rw-rw-r-- 1 entity entity 382K Sep 16 18:53 data/cleaned/custom_lexicons/cvg.computer_videogame\r\n",
      "-rw-rw-r-- 1 entity entity 6.5K Sep 16 18:53 data/cleaned/custom_lexicons/cap.1000\r\n",
      "-rw-rw-r-- 1 entity entity  44K Sep 16 18:53 data/cleaned/custom_lexicons/business.sponsor\r\n",
      "-rw-rw-r-- 1 entity entity 124K Sep 16 18:53 data/cleaned/custom_lexicons/business.consumer_product\r\n",
      "-rw-rw-r-- 1 entity entity  616 Sep 16 18:53 data/cleaned/custom_lexicons/business.consumer_company\r\n",
      "-rw-rw-r-- 1 entity entity  28K Sep 16 18:53 data/cleaned/custom_lexicons/business.brand\r\n",
      "-rw-rw-r-- 1 entity entity 2.1K Sep 16 18:53 data/cleaned/custom_lexicons/broadcast.tv_channel\r\n",
      "-rw-rw-r-- 1 entity entity  67K Sep 16 18:53 data/cleaned/custom_lexicons/book.newspaper\r\n",
      "-rw-rw-r-- 1 entity entity  44K Sep 16 18:53 data/cleaned/custom_lexicons/base.events.festival_series\r\n",
      "-rw-rw-r-- 1 entity entity 103K Sep 16 18:53 data/cleaned/custom_lexicons/award.award\r\n",
      "-rw-rw-r-- 1 entity entity  69K Sep 16 18:53 data/cleaned/custom_lexicons/automotive.model\r\n",
      "-rw-rw-r-- 1 entity entity 4.3K Sep 16 18:53 data/cleaned/custom_lexicons/automotive.make\r\n",
      "-rw-rw-r-- 1 entity entity 176K Sep 16 18:53 data/cleaned/custom_lexicons/architecture.museum\r\n",
      "-rw-rw-r-- 1 entity entity  36K Sep 16 18:53 data/cleaned/custom_lexicons/lastname.5000\r\n",
      "-rw-rw-r-- 1 entity entity 146K Sep 16 18:53 data/cleaned/custom_lexicons/internet.website\r\n",
      "-rw-rw-r-- 1 entity entity 183K Sep 16 18:53 data/cleaned/custom_lexicons/government.government_agency\r\n",
      "-rw-rw-r-- 1 entity entity  38K Sep 16 18:53 data/cleaned/custom_lexicons/firstname.5k\r\n",
      "-rw-rw-r-- 1 entity entity 3.6K Sep 16 18:53 data/cleaned/custom_lexicons/english.stop\r\n",
      "-rw-rw-r-- 1 entity entity 793K Sep 16 18:53 data/cleaned/custom_lexicons/education.university\r\n",
      "-rw-rw-r-- 1 entity entity 350K Sep 16 18:53 data/cleaned/custom_lexicons/transportation.road\r\n",
      "-rw-rw-r-- 1 entity entity 198K Sep 16 18:53 data/cleaned/custom_lexicons/time.recurring_event\r\n",
      "-rw-rw-r-- 1 entity entity  11K Sep 16 18:53 data/cleaned/custom_lexicons/time.holiday\r\n",
      "-rw-rw-r-- 1 entity entity 555K Sep 16 18:53 data/cleaned/custom_lexicons/sports.sports_team\r\n",
      "-rw-rw-r-- 1 entity entity  94K Sep 16 18:53 data/cleaned/custom_lexicons/sports.sports_league\r\n",
      "-rw-rw-r-- 1 entity entity 442K Sep 16 18:53 data/cleaned/custom_lexicons/product\r\n",
      "-rw-rw-r-- 1 entity entity  46K Sep 16 18:53 data/cleaned/custom_lexicons/people.family_name\r\n",
      "-rw-rw-r-- 1 entity entity  41K Sep 16 18:53 data/cleaned/custom_lexicons/lower.5000\r\n",
      "-rw-rw-r-- 1 entity entity 5.9K Sep 16 18:53 data/cleaned/custom_lexicons/location.country\r\n",
      "-rw-rw-r-- 1 entity entity 9.9M Sep 16 18:53 data/cleaned/custom_lexicons/location\r\n",
      "-rw-rw-r-- 1 entity entity 4.8K Sep 16 18:53 data/cleaned/custom_lexicons/venues\r\n",
      "-rw-rw-r-- 1 entity entity  33K Sep 16 18:53 data/cleaned/custom_lexicons/venture_capital.venture_funded_company\r\n",
      "-rw-rw-r-- 1 entity entity 663K Sep 16 18:53 data/cleaned/custom_lexicons/tv.tv_program\r\n",
      "-rw-rw-r-- 1 entity entity  29K Sep 16 18:53 data/cleaned/custom_lexicons/tv.tv_network\r\n",
      "-rw-rw-r-- 1 entity entity 171K Sep 16 20:34 data/cleaned/custom_lexicons/cities.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 1.8M Sep 16 20:34 data/cleaned/custom_lexicons/movies.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 935K Sep 16 20:34 data/cleaned/custom_lexicons/music_artists.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 991K Sep 16 20:34 data/cleaned/custom_lexicons/music_artists.txt\r\n",
      "-rw-rw-r-- 1 entity entity 138K Sep 16 20:34 data/cleaned/custom_lexicons/tvshows.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity  48K Sep 16 20:34 data/cleaned/custom_lexicons/sportsteams.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity  404 Sep 16 20:34 data/cleaned/custom_lexicons/products.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 697K Sep 16 20:34 data/cleaned/custom_lexicons/persons.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 123M Sep 16 21:17 data/cleaned/custom_lexicons/all_geonames.txt\r\n",
      "-rw-rw-r-- 1 entity entity  25M Oct  1 21:57 data/cleaned/custom_lexicons/all_movie_titles.txt\r\n",
      "-rw-rw-r-- 1 entity entity  66M Oct  1 23:31 data/cleaned/custom_lexicons/musicartist_names.unique.txt\r\n",
      "-rw-rw-r-- 1 entity entity  22M Oct  1 23:31 data/cleaned/custom_lexicons/musicartist_namevariants.unique.txt\r\n",
      "-rw-rw-r-- 1 entity entity 1.2M Oct  2 00:55 data/cleaned/custom_lexicons/companynames.txt.results.txt\r\n",
      "-rw-rw-r-- 1 entity entity 1.5M Oct  2 01:01 data/cleaned/custom_lexicons/buildings.txt.results.txt\r\n"
     ]
    }
   ],
   "source": [
    "! ls -ltrh data/cleaned/custom_lexicons/*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk import wordnet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "147306\n"
     ]
    }
   ],
   "source": [
    "print len([k for k in wordnet.wordnet.all_lemma_names()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Companies\n",
    "```\n",
    "cat ../custom_lexicons/business.* ../custom_lexicons/companynames.txt.results.txt ../custom_lexicons/venture_capital.venture_funded_company | sort -f | uniq -i > companies.txt\n",
    "cat ../custom_lexicons/all_movie_titles.txt ../custom_lexicons/movies.txt.results.txt | sort -f | uniq -i > movies.txt\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filenames = glob(\"data/cleaned/custom_lexicons/business*\") + [\n",
    "    \"data/cleaned/custom_lexicons/companynames.txt.results.txt\",\n",
    "    \"data/cleaned/custom_lexicons/venture_capital.venture_funded_company\",\n",
    "    \"data/cleaned/custom_lexicons/cvg.cvg_developer\"\n",
    "]\n",
    "print filenames\n",
    "\n",
    "big_set = set()\n",
    "for fn in filenames:\n",
    "    with open(fn) as fp:\n",
    "        print fn, \n",
    "        i = 0\n",
    "        for line in fp:\n",
    "            line = line.strip().title()\n",
    "            big_set.add(line)\n",
    "            i += 1\n",
    "        print \"Processed: %s\" % i\n",
    "\n",
    "        \n",
    "with open(\"data/cleaned/selective_lexicons/companies.txt\", \"wb+\") as fp:\n",
    "    print len(big_set)\n",
    "    for k in big_set:\n",
    "        print >> fp, k        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Products"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filenames = glob(\"data/cleaned/custom_lexicons/automotive.*\") + [\n",
    "    \"data/cleaned/custom_lexicons/cities.txt.results.txt\",\n",
    "    \"data/cleaned/custom_lexicons/cap.1000\",\n",
    "    \"data/cleaned/custom_lexicons/all_geonames.txt\"\n",
    "]\n",
    "print filenames"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
